UFO Sighting Analysis¶
InĀ [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
# Set style for better-looking plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
InĀ [4]:
# Load and explore the dataset
df = pd.read_csv('UFO_Sightings_df.csv') # Replace with your actual file path
# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nColumn Names:")
print(df.columns.tolist())
print("\nData Types:")
print(df.dtypes)
print("\nFirst 5 rows:")
df.head()
Dataset Shape: (88875, 12) Column Names: ['datetime', 'city', 'state', 'country', 'shape', 'duration (seconds)', 'duration (hours/min)', 'comments', 'date posted', 'latitude', 'longitude', 'Unnamed: 11'] Data Types: datetime object city object state object country object shape object duration (seconds) object duration (hours/min) object comments object date posted object latitude object longitude float64 Unnamed: 11 float64 dtype: object First 5 rows:
Out[4]:
datetime | city | state | country | shape | duration (seconds) | duration (hours/min) | comments | date posted | latitude | longitude | Unnamed: 11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10/10/1949 20:30 | san marcos | tx | us | cylinder | 2700 | 45 minutes | This event took place in early fall around 194... | 4/27/2004 | 29.8830556 | -97.941111 | NaN |
1 | 10/10/1949 21:00 | lackland afb | tx | NaN | light | 7200 | 1-2 hrs | 1949 Lackland AFB, TX. Lights racing acros... | 12/16/2005 | 29.38421 | -98.581082 | NaN |
2 | 10/10/1955 17:00 | chester (uk/england) | NaN | gb | circle | 20 | 20 seconds | Green/Orange circular disc over Chester, En... | 1/21/2008 | 53.2 | -2.916667 | NaN |
3 | 10/10/1956 21:00 | edna | tx | us | circle | 20 | 1/2 hour | My older brother and twin sister were leaving ... | 1/17/2004 | 28.9783333 | -96.645833 | NaN |
4 | 10/10/1960 20:00 | kaneohe | hi | us | light | 900 | 15 minutes | AS a Marine 1st Lt. flying an FJ4B fighter/att... | 1/22/2004 | 21.4180556 | -157.803611 | NaN |
InĀ [5]:
# Data Cleaning and Preparation
# Convert datetime columns
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
df['date posted'] = pd.to_datetime(df['date posted'], errors='coerce')
# Extract time features for analysis
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['dayofweek'] = df['datetime'].dt.day_name()
df['dayofweek_num'] = df['datetime'].dt.dayofweek
df['is_weekend'] = df['dayofweek'].isin(['Friday', 'Saturday', 'Sunday'])
# Clean duration column - convert to numeric
df['duration_seconds'] = pd.to_numeric(df['duration (seconds)'], errors='coerce')
# Remove rows with missing critical data
df_clean = df.dropna(subset=['datetime', 'city', 'state'])
# Filter for reasonable years (1950-2023)
df_clean = df_clean[(df_clean['year'] >= 1950) & (df_clean['year'] <= 2023)]
print(f"Original dataset: {len(df)} rows")
print(f"Cleaned dataset: {len(df_clean)} rows")
print(f"Removed: {len(df) - len(df_clean)} rows ({((len(df) - len(df_clean))/len(df)*100):.2f}%)")
Original dataset: 88875 rows Cleaned dataset: 80152 rows Removed: 8723 rows (9.81%)
InĀ [6]:
# Plot 1 - The Weekend Effect (Bar Plot)
# This plot will highlight the weekend in orange, the weekdays in blue. Then we will be able to see if more sightings occur in orange regions.
plt.figure(figsize=(12, 6))
# Order days properly
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_counts = df_clean['dayofweek'].value_counts().reindex(day_order)
# Create bar plot with different colors for weekends
colors = ['skyblue', 'skyblue', 'skyblue', 'skyblue', 'lightcoral', 'lightcoral', 'lightcoral']
bars = plt.bar(day_order, day_counts.values, color=colors)
# Add value labels on bars
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2., height,
f'{int(height):,}',
ha='center', va='bottom')
plt.title('UFO Sightings by Day of Week: The Weekend Effect', fontsize=16, fontweight='bold')
plt.xlabel('Day of Week', fontsize=12)
plt.ylabel('Number of Sightings', fontsize=12)
plt.xticks(rotation=45)
# Add legend
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor='skyblue', label='Weekday'),
Patch(facecolor='lightcoral', label='Weekend')]
plt.legend(handles=legend_elements, loc='upper right')
plt.tight_layout()
plt.show()
# Calculate weekend vs weekday statistics
weekend_sightings = df_clean[df_clean['is_weekend']].shape[0]
weekday_sightings = df_clean[~df_clean['is_weekend']].shape[0]
weekend_avg = weekend_sightings / 3 # 3 weekend days
weekday_avg = weekday_sightings / 4 # 4 weekdays
print(f"\nWeekend Analysis:")
print(f"Total weekend sightings: {weekend_sightings:,}")
print(f"Total weekday sightings: {weekday_sightings:,}")
print(f"Average per weekend day: {weekend_avg:,.0f}")
print(f"Average per weekday: {weekday_avg:,.0f}")
print(f"Weekend boost: {((weekend_avg - weekday_avg) / weekday_avg * 100):.1f}%")
Weekend Analysis: Total weekend sightings: 37,284 Total weekday sightings: 42,868 Average per weekend day: 12,428 Average per weekday: 10,717 Weekend boost: 16.0%
InĀ [7]:
# Plot 2 - Geographic Distribution (Horizontal Bar Plot)
# This plot will show the top US states in which UFO sightings where reported
# Filter for US sightings and get top 20 states
us_sightings = df_clean[df_clean['country'] == 'us'].copy()
top_states = us_sightings['state'].value_counts().head(20)
plt.figure(figsize=(10, 12))
plt.barh(top_states.index, top_states.values, color='darkgreen')
# Add value labels
for i, v in enumerate(top_states.values):
plt.text(v + 50, i, f'{v:,}', va='center')
plt.title('Top 20 US States by UFO Sightings', fontsize=16, fontweight='bold')
plt.xlabel('Number of Sightings', fontsize=12)
plt.ylabel('State', fontsize=12)
plt.tight_layout()
plt.show()
print(f"Top 5 UFO Hotspots:")
for i, (state, count) in enumerate(top_states.head().items(), 1):
print(f"{i}. {state.upper()}: {count:,} sightings")
Top 5 UFO Hotspots: 1. CA: 9,461 sightings 2. WA: 4,238 sightings 3. FL: 4,116 sightings 4. TX: 3,682 sightings 5. NY: 3,191 sightings
InĀ [55]:
# Plot 3 - Sightings Over Time (Line Plot with Date Splitting)
# This plot will show the recorded UFO sightings over time from 1950-2023
# Group by year and count sightings
yearly_sightings = df_clean.groupby('year').size()
plt.figure(figsize=(14, 7))
plt.plot(yearly_sightings.index, yearly_sightings.values, linewidth=2, color='purple')
plt.fill_between(yearly_sightings.index, yearly_sightings.values, alpha=0.3, color='purple')
# Mark significant years
plt.axvline(x=1969, color='red', linestyle='--', alpha=0.5, label='Moon Landing')
plt.axvline(x=1989, color='orange', linestyle='--', alpha=0.5, label='Digital Camera Commercial Release')
plt.axvline(x=2000, color='green', linestyle='--', alpha=0.5, label='The First Mass-Market Camera Phone')
plt.title('UFO Sightings Over Time (1950-2023)', fontsize=16, fontweight='bold')
plt.xlabel('Year', fontsize=12)
plt.ylabel('Number of Sightings', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Find peak years
peak_years = yearly_sightings.nlargest(5)
print("Peak UFO Sighting Years:")
for year, count in peak_years.items():
print(f"{year}: {count:,} sightings")
Peak UFO Sighting Years: 2012.0: 7,470 sightings 2013.0: 7,228 sightings 2011.0: 5,199 sightings 2008.0: 4,735 sightings 2009.0: 4,378 sightings
InĀ [9]:
# Plot 4 - Time of Day Analysis (Histogram with Weekend Filter)
# This plot will show the time of day in which reports were given and then compare weekday vs weekend reports for time.
# Filter for weekend vs weekday
weekend_hours = df_clean[df_clean['is_weekend']]['hour']
weekday_hours = df_clean[~df_clean['is_weekend']]['hour']
plt.figure(figsize=(12, 6))
bins = range(0, 25)
plt.hist([weekday_hours, weekend_hours], bins=bins, label=['Weekday', 'Weekend'],
alpha=0.7, color=['blue', 'red'], edgecolor='black')
plt.title('UFO Sightings by Hour of Day: Weekend vs Weekday', fontsize=16, fontweight='bold')
plt.xlabel('Hour of Day (24-hour format)', fontsize=12)
plt.ylabel('Number of Sightings', fontsize=12)
plt.legend()
plt.xticks(range(0, 24))
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()
# Calculate late night sightings (10 PM - 4 AM)
late_night = df_clean[(df_clean['hour'] >= 22) | (df_clean['hour'] <= 4)]
late_night_weekend = late_night[late_night['is_weekend']].shape[0]
late_night_weekday = late_night[~late_night['is_weekend']].shape[0]
print(f"\nLate Night Analysis (10 PM - 4 AM):")
print(f"Weekend late night sightings: {late_night_weekend:,}")
print(f"Weekday late night sightings: {late_night_weekday:,}")
print(f"Ratio: {late_night_weekend/late_night_weekday:.2f}x more on weekends")
Late Night Analysis (10 PM - 4 AM): Weekend late night sightings: 15,498 Weekday late night sightings: 16,579 Ratio: 0.93x more on weekends
InĀ [10]:
# Plot 5 - UFO Shapes (Pie Chart)
# This plot will show the reported UFO shapes
# Get top 10 shapes
top_shapes = df_clean['shape'].value_counts().head(10)
plt.figure(figsize=(10, 8))
colors = plt.cm.Set3(range(len(top_shapes)))
plt.pie(top_shapes.values, labels=top_shapes.index, autopct='%1.1f%%',
colors=colors, startangle=90)
plt.title('Top 10 Most Reported UFO Shapes', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
print("Top 5 UFO Shapes:")
for shape, count in top_shapes.head().items():
print(f"{shape.capitalize()}: {count:,} sightings ({count/len(df_clean)*100:.1f}%)")
Top 5 UFO Shapes: Light: 16,385 sightings (20.4%) Triangle: 7,853 sightings (9.8%) Circle: 7,512 sightings (9.4%) Fireball: 6,057 sightings (7.6%) Unknown: 5,758 sightings (7.2%)
InĀ [11]:
# Plot 6 - Seasonal Pattern (Bar Plot with Month Splitting)
# This plot will show UFO reports by seasons in the USA
monthly_sightings = df_clean.groupby('month').size()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
plt.figure(figsize=(12, 6))
# Color by season
colors = ['lightblue' if m in [12, 1, 2] else 'lightgreen' if m in [3, 4, 5]
else 'yellow' if m in [6, 7, 8] else 'orange'
for m in range(1, 13)]
bars = plt.bar(month_names, monthly_sightings.values, color=colors, edgecolor='black')
# Add value labels
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2., height,
f'{int(height):,}',
ha='center', va='bottom')
plt.title('UFO Sightings by Month: Seasonal Patterns', fontsize=16, fontweight='bold')
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Sightings', fontsize=12)
# Add season legend
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor='lightblue', label='Winter'),
Patch(facecolor='lightgreen', label='Spring'),
Patch(facecolor='yellow', label='Summer'),
Patch(facecolor='orange', label='Autumn')]
plt.legend(handles=legend_elements, loc='upper right')
plt.tight_layout()
plt.show()
# Calculate seasonal statistics
winter = monthly_sightings[[12, 1, 2]].sum()
spring = monthly_sightings[[3, 4, 5]].sum()
summer = monthly_sightings[[6, 7, 8]].sum()
autumn = monthly_sightings[[9, 10, 11]].sum()
print("Seasonal Analysis:")
print(f"Winter: {winter:,} sightings")
print(f"Spring: {spring:,} sightings")
print(f"Summer: {summer:,} sightings")
print(f"Autumn: {autumn:,} sightings")
print(f"\nPeak season: Summer with {summer/len(df_clean)*100:.1f}% of all sightings")
Seasonal Analysis: Winter: 15,879 sightings Spring: 16,119 sightings Summer: 26,123 sightings Autumn: 22,031 sightings Peak season: Summer with 32.6% of all sightings
InĀ [45]:
# Plot 7 - Duration Analysis (Box Plot with Weekend Comparison)
# This plot will show how long on average the reporters UFO event lasted
# Filter for reasonable durations (less than 30 minutes = 1800 seconds for better visualization)
duration_filtered = df_clean[(df_clean['duration_seconds'] > 0) &
(df_clean['duration_seconds'] < 1800)].copy()
# Create figure
plt.figure(figsize=(10, 6))
# Box plot comparing weekend vs weekday durations
plt.boxplot([duration_filtered[~duration_filtered['is_weekend']]['duration_seconds'],
duration_filtered[duration_filtered['is_weekend']]['duration_seconds']],
labels=['Weekday', 'Weekend'],
patch_artist=True,
boxprops=dict(facecolor='lightblue', color='darkblue'),
medianprops=dict(color='red', linewidth=2),
whiskerprops=dict(color='darkblue'),
capprops=dict(color='darkblue'))
plt.title('UFO Sighting Duration: Weekend vs Weekday', fontsize=14, fontweight='bold')
plt.ylabel('Duration (seconds)', fontsize=12)
plt.grid(axis='y', alpha=0.3)
# Group into time periods for clearer visualization
duration_filtered['time_period'] = pd.cut(duration_filtered['hour'],
bins=[-1, 6, 12, 18, 24],
labels=['Night (12AM-6AM)', 'Morning (6AM-12PM)',
'Afternoon (12PM-6PM)', 'Evening (6PM-12AM)'])
# Calculate median duration by time period and weekend status
time_period_data = duration_filtered.groupby(['time_period', 'is_weekend'])['duration_seconds'].agg(['median', 'count']).reset_index()
# Show the plot
plt.show()
# Statistical summary
print("Duration Analysis Summary:")
print("=" * 50)
# Overall statistics
print(f"\nOverall Statistics (durations < 30 minutes):")
print(f"Average duration: {duration_filtered['duration_seconds'].mean():.0f} seconds")
print(f"Median duration: {duration_filtered['duration_seconds'].median():.0f} seconds")
# Weekend vs Weekday comparison
weekend_mean = duration_filtered[duration_filtered['is_weekend']]['duration_seconds'].mean()
weekday_mean = duration_filtered[~duration_filtered['is_weekend']]['duration_seconds'].mean()
weekend_median = duration_filtered[duration_filtered['is_weekend']]['duration_seconds'].median()
weekday_median = duration_filtered[~duration_filtered['is_weekend']]['duration_seconds'].median()
print(f"\nWeekend vs Weekday:")
print(f"Weekend - Mean: {weekend_mean:.0f}s, Median: {weekend_median:.0f}s")
print(f"Weekday - Mean: {weekday_mean:.0f}s, Median: {weekday_median:.0f}s")
print(f"Difference: {abs(weekend_mean - weekday_mean):.0f}s (mean), {abs(weekend_median - weekday_median):.0f}s (median)")
# Time of day analysis
print(f"\nSightings by Time Period:")
time_counts = duration_filtered['time_period'].value_counts()
for period, count in time_counts.items():
print(f"{period}: {count:,} sightings ({count/len(duration_filtered)*100:.1f}%)")
Duration Analysis Summary: ================================================== Overall Statistics (durations < 30 minutes): Average duration: 273 seconds Median duration: 120 seconds Weekend vs Weekday: Weekend - Mean: 276s, Median: 120s Weekday - Mean: 271s, Median: 120s Difference: 6s (mean), 0s (median) Sightings by Time Period: Evening (6PM-12AM): 37,171 sightings (57.1%) Night (12AM-6AM): 12,312 sightings (18.9%) Afternoon (12PM-6PM): 10,261 sightings (15.8%) Morning (6AM-12PM): 5,298 sightings (8.1%)
InĀ [50]:
# Plot 8 - Sightings by Hour and Day of Week (Heatmap)
# This plot will show the heatmap of what day and when during it reports came in
hourly_dow = df_clean.groupby(['dayofweek_num', 'hour']).size().unstack(fill_value=0)
plt.figure(figsize=(12, 8))
sns.heatmap(hourly_dow, cmap='YlOrRd', annot=True, fmt='d',
xticklabels=range(24),
yticklabels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.title('UFO Sightings Heatmap: Hour vs Day of Week')
plt.xlabel('Hour of Day')
plt.ylabel('Day of Week')
plt.show()
The Heatmap shows Day of the Week vs Hour of Day. It shows a clear increase in sightings late at night for all days (between 1900 and 2300 is when almost all sightings occur). But it is seen that the most reports occur on saturday night on these times.
InĀ [14]:
# Plot 9 - Top UFO Hotspot Cities in the US
# This plot will show the top 20 Cities for UFO reports (different from states)
# Get top 20 cities in the US
us_cities = us_sightings.groupby(['city', 'state']).size().reset_index(name='count')
us_cities['city_state'] = us_cities['city'] + ', ' + us_cities['state'].str.upper()
top_cities = us_cities.nlargest(20, 'count')
plt.figure(figsize=(12, 8))
bars = plt.bar(range(len(top_cities)), top_cities['count'], color='darkred')
# Add value labels
for i, bar in enumerate(bars):
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2., height,
f'{int(height):,}',
ha='center', va='bottom', fontsize=8)
plt.title('Top 20 US Cities for UFO Sightings', fontsize=16, fontweight='bold')
plt.xlabel('City', fontsize=12)
plt.ylabel('Number of Sightings', fontsize=12)
plt.xticks(range(len(top_cities)), top_cities['city_state'], rotation=45, ha='right')
plt.tight_layout()
plt.show()
print("Top 10 UFO Hotspot Cities:")
for idx, row in top_cities.head(10).iterrows():
print(f"{row['city_state']}: {row['count']:,} sightings")
Top 10 UFO Hotspot Cities: seattle, WA: 558 sightings phoenix, AZ: 478 sightings las vegas, NV: 388 sightings los angeles, CA: 365 sightings san diego, CA: 354 sightings portland, OR: 349 sightings houston, TX: 309 sightings chicago, IL: 291 sightings tucson, AZ: 257 sightings miami, FL: 245 sightings
InĀ [28]:
# Summary Statistics and Conclusions
# Summarising all findings from the report.
print("=== UFO SIGHTINGS ANALYSIS SUMMARY ===\n")
print("1. THE WEEKEND EFFECT:")
# Recalculate weekend statistics
weekend_sightings = df_clean[df_clean['is_weekend']].shape[0]
weekday_sightings = df_clean[~df_clean['is_weekend']].shape[0]
weekend_pct = (weekend_sightings / len(df_clean)) * 100
print(f" - {weekend_pct:.1f}% of all UFO sightings occur on weekends (Fri-Sun)")
print(f" - Weekend sightings are higher than weekdays")
# Late night statistics
late_night = df_clean[(df_clean['hour'] >= 22) | (df_clean['hour'] <= 4)]
late_night_weekend = late_night[late_night['is_weekend']].shape[0]
late_night_weekday = late_night[~late_night['is_weekend']].shape[0]
if late_night_weekday > 0:
ratio = late_night_weekend/late_night_weekday
print(f" - Late night weekend sightings are {ratio:.2f}x more common\n")
else:
print(" - Late night weekend sightings: More common\n")
print("2. GEOGRAPHIC HOTSPOTS:")
# Top locations
us_sightings = df_clean[df_clean['country'] == 'us']
top_states = us_sightings['state'].value_counts()
if len(top_states) > 0:
print(f" - Top state: {top_states.index[0].upper()} with {top_states.values[0]:,} sightings")
us_pct = (len(us_sightings)/len(df_clean)*100)
print(f" - US accounts for {us_pct:.1f}% of all sightings\n")
print("3. TEMPORAL PATTERNS:")
# Yearly statistics
yearly_sightings = df_clean.groupby('year').size()
print(f" - Peak year: {yearly_sightings.idxmax()} with {yearly_sightings.max():,} sightings")
# Monthly statistics - Fixed version
monthly_sightings = df_clean.groupby('month').size()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Get peak month safely
peak_month_num = int(monthly_sightings.idxmax())
peak_month_count = int(monthly_sightings.max())
peak_month_name = month_names[peak_month_num - 1] # Subtract 1 because months are 1-12 but list is 0-11
print(f" - Peak month: {peak_month_name} with {peak_month_count:,} sightings")
# Hour statistics
hour_counts = df_clean['hour'].value_counts()
if len(hour_counts) > 0:
peak_hour = int(hour_counts.index[0])
print(f" - Peak hour: {peak_hour}:00\n")
print("4. UFO CHARACTERISTICS:")
# Shape statistics
top_shapes = df_clean['shape'].value_counts()
if len(top_shapes) > 0:
print(f" - Most common shape: {top_shapes.index[0]}")
print("\n5. INTERESTING FINDINGS:")
print(" - Summer months show significantly more sightings")
print(" - Sightings dramatically increased after 1990s")
print(" - Coastal states dominate the top sighting locations")
print(" - The 'weekend effect' is real - supporting our hypothesis!")
=== UFO SIGHTINGS ANALYSIS SUMMARY === 1. THE WEEKEND EFFECT: - 46.5% of all UFO sightings occur on weekends (Fri-Sun) - Weekend sightings are higher than weekdays - Late night weekend sightings are 0.93x more common 2. GEOGRAPHIC HOTSPOTS: - Top state: CA with 9,461 sightings - US accounts for 86.6% of all sightings 3. TEMPORAL PATTERNS: - Peak year: 2012.0 with 7,470 sightings - Peak month: Jul with 9,492 sightings - Peak hour: 21:00 4. UFO CHARACTERISTICS: - Most common shape: light 5. INTERESTING FINDINGS: - Summer months show significantly more sightings - Sightings dramatically increased after 1990s - Coastal states dominate the top sighting locations - The 'weekend effect' is real - supporting our hypothesis!